{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=</td>\n",
       "      <td>2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=</td>\n",
       "      <td>3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                           msno  \\\n",
       "0  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "1  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "2  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "3  Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=   \n",
       "4  FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=   \n",
       "\n",
       "                                        song_id  target  \n",
       "0  BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=       1  \n",
       "1  bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=       1  \n",
       "2  JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=       1  \n",
       "3  2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=       1  \n",
       "4  3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=       1  "
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取数据\n",
    "dtype={\"msno\": object,\"song_id\": object,\"target\": int}\n",
    "dpath = '../Data/'\n",
    "df_kkbox = pd.read_csv(dpath+'train.csv', dtype=dtype,usecols=[col for col in dtype])\n",
    "df_kkbox.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_clicked = df_kkbox[df_kkbox['target']==1]\n",
    "df_unclick = df_kkbox[df_kkbox['target']==0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_subscriber_msnos = df_clicked.groupby(['msno','song_id']).sum()\n",
    "df_unclick_msnos = df_unclick.groupby(['msno','song_id']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=</th>\n",
       "      <th>+JGuj3rm4FBs8loN7rvI+JZ+EX3K9+WaxbDtmjs6mQc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+MRnGH0Gg7jA7izLFRU1SZtGPmWHdsWTeL9wRXChnRA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+s9ez4bdaiqa+RSSGU7ANmC83+f+zS1QtnbnuoDkxlc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>/6kgyFt1nsLDNbZh0AHM4Pb3sbp3CZ0C4DbPa4++eVE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0DQLNH6yvG4LUtNy+ko7mSHx+2vKSEWgWVZrZR0hJeE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0Ge/KaQyOshftregia3O17tJVxCneIpU7tgfkWRXFOo=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0TZ4BKn9YES2upK9ZgKgsAUtUClVKHnct79pnA/iPiQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0cs1Hj5EEWtTIKnL3OQ2G1XhsD+oLfBneiKBRoTHJ10=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0dkoI2pTRTUQHef2VTAt7d4Pf59fjHtF7PaHA6SMKEk=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0vBTdkfbCDjBihoqg+UAEIkuFS+6WUmTpgv+iPHP9fY=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1+hRCbg0uUki4rpsZlxlO8GZYruSPSEnFSsWxjyIBX8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>156jveBrNsBiAvX9W5/Bhcq330scjrczYYGMjx7C6f8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1FWFwb2xgSrjkZ1sH8NFb+H55U/rv0tWH7HCyp3RQVg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22wN6a2Q/qVQosA8OBfVg6AbwvI2KJnZ35DXLqpLHJg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2HzD9vqNzCXCs1FG7bCLMdVuHhy+dPmA3kR6sxczoHg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2MRXrIB8hS/wlIIfCI0zOofjWh00gIPcY7U0nkvt0Do=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2Y7NstzrTnCFlRel5m80hPDe9471QRU9oyqPIx1xIP0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2lHnMx+RWbPYv6NqByyQLM9z+gR7wR2NX1v+xTMO3iY=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2oNvM+pm2OrgVOR6ogzOWpDFSgM34W2GKyuj9s9cnTU=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2xyj32lIiLxSjZyDtITN1INnuRoMHDqLf0/DnEFRiXY=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35HZRjWgSVN2C5jI9dTRRfqnyu3xiuB+Q/BxNZfFgMc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35dx60z4m4+Lg+qIS0l2A8vspbthqnpTylWUu51jW+4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>364mdkfol3/iMgO0iKge+CvE55hKgimQMJ5GrxxwuRU=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39wBIgWcybJ0LlPeVmApMPmQToJ+iG3VxzQHepCh92A=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3NLnW15HfIOF+VCDsJ09OTAAl7URq9+rtioJts+l5k0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3XsgDUc1X0IprrHaX3IHV/eygRBC9cdP46M00NOgR8s=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3kevTU3qYXoDjCz3P6Zg6Z70T+qJHy0ogG+6ja+FOxA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3o0eoSASXAM5S54TkkkMYHgbg07Km3md+S4b8zjZDT4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3tej7/GZU1yjMOpASdP205Zyj5vS/ONWw118umvUQIs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"28\" valign=\"top\">zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=</th>\n",
       "      <th>jgR1t7eunTfEmalVEkEl9eV2L2c4Y6ci1+171CF6DEo=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>mXjOkV+pc7d91ctXLz4OgXTB7tEse/eAorkCvxNFHjY=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>nwEcF/fOvuPOOTYKG1h8N6VyDtbgzmyA2G7+N1L4qZ8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oDG5WCyUlb9i/Sr34Fljeh0YsONZKxl+7jjUw27vm5U=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>oF03xgIc9Z+hV316VfjIQ+obYwnv35u5YB3PD+6JaJM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ofqMCLCMtBo/sFM3e5NR3+lKn1qntdAGhcWGSHvF40o=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ooic+5zqkzuYoL4Py1JX18I4ZdMnOSFMHTh9+0O07Ao=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>p29OR54+PVxhlcC6Fmuuzoa+jeZKzh/EhJUJf6mvQbA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pfiKQT5IoBLAwScq3b1kNpUhj6GZy73yirxu0Hz6Jo4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pfnlCKRD1vpfe76haKcTthXKmSTaL2h8vlYDeoGpLWw=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qIrAqKtDPBACcsAjZWh9OXYSfGYzx8Wwcz5szbX/n+E=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qm+hOFUCV0HMX1mQpU5nv9zxLh7F7VjULF0KX3yL73M=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qm1G/0r9ZHsHskGuhIBOqKl1emPSQMAiVARG49LfMLI=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>quxZPlPUmS8QsJcSjHdn6zE1quBWk7w4ii3rLgOaawQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>r918Tutfx0ADBiR7FHX0aJFE6IvN6wQBNlaJCCTKA1I=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rFLawM8j/vW78o/S3P2GTsmy6HwkTI15P7NF/i9CkX4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rbDawE1MLdOzQv3xGb9sG7n2Lm6rRq+VAPXLi+/wSX0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sbeFHrrr6+2Vwq54A5bBUpFsQ6xkNoDc7fzqKLK1Mo8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>t1iz44Bys600zwNh4NMFFkwIHmea9UW+nbkN3lxfFo4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>t2ebdTJR77cxY9u9uDUSZ0UBnpSTwsDjx7nUPptVKr4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>truNpzPqG5vPHAayapIz8FTnpD2qs+UHS+KGJs6S5yw=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vUtVON1/U8ktsmq0q1WIlQnYHycT2AmqqpmRfcQx5k0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vdbUGlcRq3cB0XvkCYSTlMEVYMKEDlGauRA7iElyxXQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vldg6m5rQExUQNpVq0W/L/0VeDC6ktIFC/u2ezAudO4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wECvtbYxQG1OF5plm68OsbPSZnlZTaIEEbB9npniATw=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wq0/WJsu4L8YBFbn1r5gPVgbx/EcVP0R/GrQOHSvN5g=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>x4R8XJOLjg1x1cvqdCgJXkq0fUOIyU7JQyfzApEZMfs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zqDZjACUVfphX2Me6LEbMwDWLXA4bIWCbSSD+QsIypQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"2\" valign=\"top\">zzzRi5ek1YCKTGns8C77xwAutE05PAPmz8T/pIIQhzE=</th>\n",
       "      <th>rqNd52MxQ+okfR61j2c7u36DLODO4d2Zt/hMm3jxncM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wBTWuHbjdjxnG1lQcbqnK4FddV24rUhuyrYLd9c/hmk=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3714656 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                           target\n",
       "msno                                         song_id                                             \n",
       "++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU= +JGuj3rm4FBs8loN7rvI+JZ+EX3K9+WaxbDtmjs6mQc=       1\n",
       "                                             +MRnGH0Gg7jA7izLFRU1SZtGPmWHdsWTeL9wRXChnRA=       1\n",
       "                                             +Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=       1\n",
       "                                             +s9ez4bdaiqa+RSSGU7ANmC83+f+zS1QtnbnuoDkxlc=       1\n",
       "                                             /6kgyFt1nsLDNbZh0AHM4Pb3sbp3CZ0C4DbPa4++eVE=       1\n",
       "                                             0DQLNH6yvG4LUtNy+ko7mSHx+2vKSEWgWVZrZR0hJeE=       1\n",
       "                                             0Ge/KaQyOshftregia3O17tJVxCneIpU7tgfkWRXFOo=       1\n",
       "                                             0TZ4BKn9YES2upK9ZgKgsAUtUClVKHnct79pnA/iPiQ=       1\n",
       "                                             0cs1Hj5EEWtTIKnL3OQ2G1XhsD+oLfBneiKBRoTHJ10=       1\n",
       "                                             0dkoI2pTRTUQHef2VTAt7d4Pf59fjHtF7PaHA6SMKEk=       1\n",
       "                                             0vBTdkfbCDjBihoqg+UAEIkuFS+6WUmTpgv+iPHP9fY=       1\n",
       "                                             1+hRCbg0uUki4rpsZlxlO8GZYruSPSEnFSsWxjyIBX8=       1\n",
       "                                             156jveBrNsBiAvX9W5/Bhcq330scjrczYYGMjx7C6f8=       1\n",
       "                                             1FWFwb2xgSrjkZ1sH8NFb+H55U/rv0tWH7HCyp3RQVg=       1\n",
       "                                             22wN6a2Q/qVQosA8OBfVg6AbwvI2KJnZ35DXLqpLHJg=       1\n",
       "                                             2HzD9vqNzCXCs1FG7bCLMdVuHhy+dPmA3kR6sxczoHg=       1\n",
       "                                             2MRXrIB8hS/wlIIfCI0zOofjWh00gIPcY7U0nkvt0Do=       1\n",
       "                                             2Y7NstzrTnCFlRel5m80hPDe9471QRU9oyqPIx1xIP0=       1\n",
       "                                             2lHnMx+RWbPYv6NqByyQLM9z+gR7wR2NX1v+xTMO3iY=       1\n",
       "                                             2oNvM+pm2OrgVOR6ogzOWpDFSgM34W2GKyuj9s9cnTU=       1\n",
       "                                             2xyj32lIiLxSjZyDtITN1INnuRoMHDqLf0/DnEFRiXY=       1\n",
       "                                             35HZRjWgSVN2C5jI9dTRRfqnyu3xiuB+Q/BxNZfFgMc=       1\n",
       "                                             35dx60z4m4+Lg+qIS0l2A8vspbthqnpTylWUu51jW+4=       1\n",
       "                                             364mdkfol3/iMgO0iKge+CvE55hKgimQMJ5GrxxwuRU=       1\n",
       "                                             39wBIgWcybJ0LlPeVmApMPmQToJ+iG3VxzQHepCh92A=       1\n",
       "                                             3NLnW15HfIOF+VCDsJ09OTAAl7URq9+rtioJts+l5k0=       1\n",
       "                                             3XsgDUc1X0IprrHaX3IHV/eygRBC9cdP46M00NOgR8s=       1\n",
       "                                             3kevTU3qYXoDjCz3P6Zg6Z70T+qJHy0ogG+6ja+FOxA=       1\n",
       "                                             3o0eoSASXAM5S54TkkkMYHgbg07Km3md+S4b8zjZDT4=       1\n",
       "                                             3tej7/GZU1yjMOpASdP205Zyj5vS/ONWw118umvUQIs=       1\n",
       "...                                                                                           ...\n",
       "zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU= jgR1t7eunTfEmalVEkEl9eV2L2c4Y6ci1+171CF6DEo=       1\n",
       "                                             mXjOkV+pc7d91ctXLz4OgXTB7tEse/eAorkCvxNFHjY=       1\n",
       "                                             nwEcF/fOvuPOOTYKG1h8N6VyDtbgzmyA2G7+N1L4qZ8=       1\n",
       "                                             oDG5WCyUlb9i/Sr34Fljeh0YsONZKxl+7jjUw27vm5U=       1\n",
       "                                             oF03xgIc9Z+hV316VfjIQ+obYwnv35u5YB3PD+6JaJM=       1\n",
       "                                             ofqMCLCMtBo/sFM3e5NR3+lKn1qntdAGhcWGSHvF40o=       1\n",
       "                                             ooic+5zqkzuYoL4Py1JX18I4ZdMnOSFMHTh9+0O07Ao=       1\n",
       "                                             p29OR54+PVxhlcC6Fmuuzoa+jeZKzh/EhJUJf6mvQbA=       1\n",
       "                                             pfiKQT5IoBLAwScq3b1kNpUhj6GZy73yirxu0Hz6Jo4=       1\n",
       "                                             pfnlCKRD1vpfe76haKcTthXKmSTaL2h8vlYDeoGpLWw=       1\n",
       "                                             qIrAqKtDPBACcsAjZWh9OXYSfGYzx8Wwcz5szbX/n+E=       1\n",
       "                                             qm+hOFUCV0HMX1mQpU5nv9zxLh7F7VjULF0KX3yL73M=       1\n",
       "                                             qm1G/0r9ZHsHskGuhIBOqKl1emPSQMAiVARG49LfMLI=       1\n",
       "                                             quxZPlPUmS8QsJcSjHdn6zE1quBWk7w4ii3rLgOaawQ=       1\n",
       "                                             r918Tutfx0ADBiR7FHX0aJFE6IvN6wQBNlaJCCTKA1I=       1\n",
       "                                             rFLawM8j/vW78o/S3P2GTsmy6HwkTI15P7NF/i9CkX4=       1\n",
       "                                             rbDawE1MLdOzQv3xGb9sG7n2Lm6rRq+VAPXLi+/wSX0=       1\n",
       "                                             sbeFHrrr6+2Vwq54A5bBUpFsQ6xkNoDc7fzqKLK1Mo8=       1\n",
       "                                             t1iz44Bys600zwNh4NMFFkwIHmea9UW+nbkN3lxfFo4=       1\n",
       "                                             t2ebdTJR77cxY9u9uDUSZ0UBnpSTwsDjx7nUPptVKr4=       1\n",
       "                                             truNpzPqG5vPHAayapIz8FTnpD2qs+UHS+KGJs6S5yw=       1\n",
       "                                             vUtVON1/U8ktsmq0q1WIlQnYHycT2AmqqpmRfcQx5k0=       1\n",
       "                                             vdbUGlcRq3cB0XvkCYSTlMEVYMKEDlGauRA7iElyxXQ=       1\n",
       "                                             vldg6m5rQExUQNpVq0W/L/0VeDC6ktIFC/u2ezAudO4=       1\n",
       "                                             wECvtbYxQG1OF5plm68OsbPSZnlZTaIEEbB9npniATw=       1\n",
       "                                             wq0/WJsu4L8YBFbn1r5gPVgbx/EcVP0R/GrQOHSvN5g=       1\n",
       "                                             x4R8XJOLjg1x1cvqdCgJXkq0fUOIyU7JQyfzApEZMfs=       1\n",
       "                                             zqDZjACUVfphX2Me6LEbMwDWLXA4bIWCbSSD+QsIypQ=       1\n",
       "zzzRi5ek1YCKTGns8C77xwAutE05PAPmz8T/pIIQhzE= rqNd52MxQ+okfR61j2c7u36DLODO4d2Zt/hMm3jxncM=       1\n",
       "                                             wBTWuHbjdjxnG1lQcbqnK4FddV24rUhuyrYLd9c/hmk=       1\n",
       "\n",
       "[3714656 rows x 1 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_subscriber_msnos.groupby(['msno','song_id']).max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>msno</th>\n",
       "      <th>song_id</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th rowspan=\"30\" valign=\"top\">++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=</th>\n",
       "      <th>+/lcxtBy9FuH0ObLsK9wRf3zl9zSyvDNMpTWSGCAXxc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+d62ngXhdNTJRLKXO8/X9+BBoj77Hs8xVHMLmYGmB4k=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+kkZYh2T9gs8DuRGpLK4O7VJOlt9eUjLDm5qjCk/zOs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>+sS5aZNAn8DtidaMrodHSa/CWObsjuB+L/T0iyjyBZc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>/AdKmEFP8MKMaxub6IrWoQyJIxiYoBHNcftKUE7RWLQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>/e9M5C71xpB0BJAt21BT/b6uHYyZOWs2mlAl5gKaOr8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>/oa0l/3cOth4EizeIL2R7Euv6BZD/Qg9iP1Tn1UsmSo=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0aYIadlcMeSYbRMBxrvku+T8lMdCagsskuGqPJ0zOjc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0oONpePMyVwC4GxAvYrwwsUzTJSsmTmieiRNvJwUGqQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0pKwIZnrNIIhf0CrIl5aXVZ1rVM9TovcGWLSgXzo7Pc=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>0smF3rcYJymevsNGZbPZnTpUGLsgV/DvukNP9+3pZzI=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1/YesP59EuUsjsv0HhrexY3HsaJOlnkys30dq3bRpso=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1IkzmkRE9ew005zhkY/PhJWSyZHb69Mwc3wFQpmxZN4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1aYHCA7p548RLSOAo5iybf5f1mPq6GjWoE64l54gqSg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1uh+3eIKwQRhE+1m9sYi2kaMV1ckoJ7CpUBflh7gIX8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24g//AakngNFUQMk2h+VMQSQ2K6YvVsDNICcxaeK2kk=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2Wg9/GJuKcsjipvjHxEks2UYP0aMMxBsOXQprypCATQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3DHoDXsw9a7T+L/0Cz2+XMiEcdYswVYW/Qwwpse5EeE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3m08z9OJVEcZc1wOleFx6h3vPhwBka5RCI0xZVnWIkY=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3vY6e3CJfrJu7+6I5G7WFxfRalkg/FWwQxdzzMiVjgg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40bz3hEl4p0TcOkcWVP6nSIZUm00IM9gsMYCwYU0nqg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4mR69XA8Si6UQvR7EtiMw6aG1ZDnLsybWjKMbwQWSSE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>507plIkmke1jh3wMrHqKore82pPFozADwydR8P0Gx2Q=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54y9gncBAx4sHzf8A1PCBIIqyXBq6rJFVfOWwv2WL2I=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5J687aBNu0Sy/MwI7RwIZScoy0vRNuxK87WsSqj2xqM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5Qh7YI03x2cbQFFv/e+faYhVpFAzjvnuE3YXkx2KgsQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5bWRmJkEywDNEmvSmRl3bAR64omJD+ZiZvlQuyCKRAs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5q6FRR591P/bIRPM79zoLa6pA5JG3ogGwddbU10aGxw=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6+g6lW+0uwyIykrhF2cpgm1SUUrzyutvT1aCDsIYgjM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6LdT9BiGIlQ3rJped7PACqnrzstFXz8dXD6MmmsA8uI=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th rowspan=\"29\" valign=\"top\">zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=</th>\n",
       "      <th>pCf4HI+z4rQsY9FF79m+Sojnl207qtHqtOWU2VFW1ZE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pkKADcjPLFdP9kX40L8dlmbLWkgUTqUB0LhurVLSTME=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>pkUnauuiX5njQkE+kYKu5jkLY/oX/K70IhXbzW6QL0I=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>q2SY4JSb1B2dlwZsoXUYBykyv/1aEC6zQ0UrKfezW0w=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>q5/Vwifgv0SGdEIyiMU2F0pIXkeCIW3w2xB8n4iMoS0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>qAbeeIWq9VsGPqYygdQpXpUKOn0oDwHcAaO3RPMfMWo=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>r5BnHxvuPlLjwr4MHf3dF3yUY+NVS6rHrkVG7wVZ1uE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>rkxczeqV19eJ+R8d2Kdakq1jXB1Fq+Dgl51l6Be4ZF0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>sl+07AEBL7vriPn3dRkRfTb5gJYWmUSb46LOhK7nzoM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>tQE09rP+6BiTuDQLOOw5ByQnrjxks01CX6hRYixTSr8=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>u3Cep8rYmeaOLq3xdjNHg8iFVXwT5RNArzQj2+VjWGM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>u9z+kvlYCkrSlDdJv3O/t5h+JpZJrSn2ZBI1GWe+EYQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uNnKRjFt1WxQMfpz2tlZAJ/2GxoQIvF+36u4lNKfhEQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uaU3VI7KA2xoQkGkSC4NNOMFmHL9R85kdR7eyY5Pc9M=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ugMqNL45FHkT+YlHpD18DSGoEYdvd4fdFRAYKYXdwe4=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>uwGAfDt1trs7aTxUM5DPYRtalQY/IPszFbRPgayilDQ=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vLTzjFd15Dtsx/iE1Y4CjjHi1+3t25AznG3JFWaE6ZA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vkagi2Lrvjx/FxLWE6FHZFoDoOaSWI052fEQCwouDUs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>vyF0hRUFnmqImkNlokua+YmouIRQR/Nty8Is0xY5BJI=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>wBTWuHbjdjxnG1lQcbqnK4FddV24rUhuyrYLd9c/hmk=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>xizfI29l3nwmPhHu9CD0f7Y9+JUPDhhVFrB5fWTZ0S0=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>xoE4aa1TBDV1bRxk0S0SxIhZKhBy+exPAlcLIptJqaA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>y5Vb0MCcZ+hLKGvptMn0y7eAftKH/RWaNoHSduHo9Hs=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yL8VeuH6lYFA4hYVGX/1HmPogBfp1Ex4ZkL5PjWy11k=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yYs87L2Y0JgUXpPS+lix+NsNpZe8B2XoNI3G/EbN7yM=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yzXG3v5fWuLlgftXsbiLYpPjPE9V9GiKp7y36z7JQZA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>z/+XZF/0QSERMd0wwcvITf4VLUg5CyaaitllethQ1Tg=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zIqxWqaNW+z+9Lgy3/mwgVGEIp8FGKW73juVwZCH8rE=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zK2kMG6yF7AOdKVQfMIPvKyTRynq+ANecPCBJ90IIeA=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>zzzRi5ek1YCKTGns8C77xwAutE05PAPmz8T/pIIQhzE=</th>\n",
       "      <th>PgRtmmESVNtWjoZHO5a1r21vIz9sVZmcJJpFCbRa1LI=</th>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3662762 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                                                           target\n",
       "msno                                         song_id                                             \n",
       "++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU= +/lcxtBy9FuH0ObLsK9wRf3zl9zSyvDNMpTWSGCAXxc=       1\n",
       "                                             +d62ngXhdNTJRLKXO8/X9+BBoj77Hs8xVHMLmYGmB4k=       1\n",
       "                                             +kkZYh2T9gs8DuRGpLK4O7VJOlt9eUjLDm5qjCk/zOs=       1\n",
       "                                             +sS5aZNAn8DtidaMrodHSa/CWObsjuB+L/T0iyjyBZc=       1\n",
       "                                             /AdKmEFP8MKMaxub6IrWoQyJIxiYoBHNcftKUE7RWLQ=       1\n",
       "                                             /e9M5C71xpB0BJAt21BT/b6uHYyZOWs2mlAl5gKaOr8=       1\n",
       "                                             /oa0l/3cOth4EizeIL2R7Euv6BZD/Qg9iP1Tn1UsmSo=       1\n",
       "                                             0aYIadlcMeSYbRMBxrvku+T8lMdCagsskuGqPJ0zOjc=       1\n",
       "                                             0oONpePMyVwC4GxAvYrwwsUzTJSsmTmieiRNvJwUGqQ=       1\n",
       "                                             0pKwIZnrNIIhf0CrIl5aXVZ1rVM9TovcGWLSgXzo7Pc=       1\n",
       "                                             0smF3rcYJymevsNGZbPZnTpUGLsgV/DvukNP9+3pZzI=       1\n",
       "                                             1/YesP59EuUsjsv0HhrexY3HsaJOlnkys30dq3bRpso=       1\n",
       "                                             1IkzmkRE9ew005zhkY/PhJWSyZHb69Mwc3wFQpmxZN4=       1\n",
       "                                             1aYHCA7p548RLSOAo5iybf5f1mPq6GjWoE64l54gqSg=       1\n",
       "                                             1uh+3eIKwQRhE+1m9sYi2kaMV1ckoJ7CpUBflh7gIX8=       1\n",
       "                                             24g//AakngNFUQMk2h+VMQSQ2K6YvVsDNICcxaeK2kk=       1\n",
       "                                             2Wg9/GJuKcsjipvjHxEks2UYP0aMMxBsOXQprypCATQ=       1\n",
       "                                             3DHoDXsw9a7T+L/0Cz2+XMiEcdYswVYW/Qwwpse5EeE=       1\n",
       "                                             3m08z9OJVEcZc1wOleFx6h3vPhwBka5RCI0xZVnWIkY=       1\n",
       "                                             3vY6e3CJfrJu7+6I5G7WFxfRalkg/FWwQxdzzMiVjgg=       1\n",
       "                                             40bz3hEl4p0TcOkcWVP6nSIZUm00IM9gsMYCwYU0nqg=       1\n",
       "                                             4mR69XA8Si6UQvR7EtiMw6aG1ZDnLsybWjKMbwQWSSE=       1\n",
       "                                             507plIkmke1jh3wMrHqKore82pPFozADwydR8P0Gx2Q=       1\n",
       "                                             54y9gncBAx4sHzf8A1PCBIIqyXBq6rJFVfOWwv2WL2I=       1\n",
       "                                             5J687aBNu0Sy/MwI7RwIZScoy0vRNuxK87WsSqj2xqM=       1\n",
       "                                             5Qh7YI03x2cbQFFv/e+faYhVpFAzjvnuE3YXkx2KgsQ=       1\n",
       "                                             5bWRmJkEywDNEmvSmRl3bAR64omJD+ZiZvlQuyCKRAs=       1\n",
       "                                             5q6FRR591P/bIRPM79zoLa6pA5JG3ogGwddbU10aGxw=       1\n",
       "                                             6+g6lW+0uwyIykrhF2cpgm1SUUrzyutvT1aCDsIYgjM=       1\n",
       "                                             6LdT9BiGIlQ3rJped7PACqnrzstFXz8dXD6MmmsA8uI=       1\n",
       "...                                                                                           ...\n",
       "zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU= pCf4HI+z4rQsY9FF79m+Sojnl207qtHqtOWU2VFW1ZE=       1\n",
       "                                             pkKADcjPLFdP9kX40L8dlmbLWkgUTqUB0LhurVLSTME=       1\n",
       "                                             pkUnauuiX5njQkE+kYKu5jkLY/oX/K70IhXbzW6QL0I=       1\n",
       "                                             q2SY4JSb1B2dlwZsoXUYBykyv/1aEC6zQ0UrKfezW0w=       1\n",
       "                                             q5/Vwifgv0SGdEIyiMU2F0pIXkeCIW3w2xB8n4iMoS0=       1\n",
       "                                             qAbeeIWq9VsGPqYygdQpXpUKOn0oDwHcAaO3RPMfMWo=       1\n",
       "                                             r5BnHxvuPlLjwr4MHf3dF3yUY+NVS6rHrkVG7wVZ1uE=       1\n",
       "                                             rkxczeqV19eJ+R8d2Kdakq1jXB1Fq+Dgl51l6Be4ZF0=       1\n",
       "                                             sl+07AEBL7vriPn3dRkRfTb5gJYWmUSb46LOhK7nzoM=       1\n",
       "                                             tQE09rP+6BiTuDQLOOw5ByQnrjxks01CX6hRYixTSr8=       1\n",
       "                                             u3Cep8rYmeaOLq3xdjNHg8iFVXwT5RNArzQj2+VjWGM=       1\n",
       "                                             u9z+kvlYCkrSlDdJv3O/t5h+JpZJrSn2ZBI1GWe+EYQ=       1\n",
       "                                             uNnKRjFt1WxQMfpz2tlZAJ/2GxoQIvF+36u4lNKfhEQ=       1\n",
       "                                             uaU3VI7KA2xoQkGkSC4NNOMFmHL9R85kdR7eyY5Pc9M=       1\n",
       "                                             ugMqNL45FHkT+YlHpD18DSGoEYdvd4fdFRAYKYXdwe4=       1\n",
       "                                             uwGAfDt1trs7aTxUM5DPYRtalQY/IPszFbRPgayilDQ=       1\n",
       "                                             vLTzjFd15Dtsx/iE1Y4CjjHi1+3t25AznG3JFWaE6ZA=       1\n",
       "                                             vkagi2Lrvjx/FxLWE6FHZFoDoOaSWI052fEQCwouDUs=       1\n",
       "                                             vyF0hRUFnmqImkNlokua+YmouIRQR/Nty8Is0xY5BJI=       1\n",
       "                                             wBTWuHbjdjxnG1lQcbqnK4FddV24rUhuyrYLd9c/hmk=       1\n",
       "                                             xizfI29l3nwmPhHu9CD0f7Y9+JUPDhhVFrB5fWTZ0S0=       1\n",
       "                                             xoE4aa1TBDV1bRxk0S0SxIhZKhBy+exPAlcLIptJqaA=       1\n",
       "                                             y5Vb0MCcZ+hLKGvptMn0y7eAftKH/RWaNoHSduHo9Hs=       1\n",
       "                                             yL8VeuH6lYFA4hYVGX/1HmPogBfp1Ex4ZkL5PjWy11k=       1\n",
       "                                             yYs87L2Y0JgUXpPS+lix+NsNpZe8B2XoNI3G/EbN7yM=       1\n",
       "                                             yzXG3v5fWuLlgftXsbiLYpPjPE9V9GiKp7y36z7JQZA=       1\n",
       "                                             z/+XZF/0QSERMd0wwcvITf4VLUg5CyaaitllethQ1Tg=       1\n",
       "                                             zIqxWqaNW+z+9Lgy3/mwgVGEIp8FGKW73juVwZCH8rE=       1\n",
       "                                             zK2kMG6yF7AOdKVQfMIPvKyTRynq+ANecPCBJ90IIeA=       1\n",
       "zzzRi5ek1YCKTGns8C77xwAutE05PAPmz8T/pIIQhzE= PgRtmmESVNtWjoZHO5a1r21vIz9sVZmcJJpFCbRa1LI=       1\n",
       "\n",
       "[3662762 rows x 1 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_unclick_msnos.groupby(['msno','song_id']).max()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "由上可知，用户对每首歌只有0和1两种状态，且用户-歌曲联合主键不重复"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(7377418, 3)\n",
      "(7377418, 3)\n",
      "(3714656, 3)\n",
      "(3714656, 3)\n",
      "(3662762, 3)\n",
      "(3662762, 3)\n"
     ]
    }
   ],
   "source": [
    "# For each frame, print its shape and then its shape after dropping\n",
    "# repeated (msno, song_id) pairs; equal shapes mean no pair is repeated.\n",
    "for frame in (df_kkbox, df_clicked, df_unclick):\n",
    "    deduplicated = frame.drop_duplicates(subset=['msno', 'song_id'])\n",
    "    print(frame.shape)\n",
    "    print(deduplicated.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "去重前后行数相同，说明 (msno, song_id) 组合不重复，所以 user-item 矩阵的每个元素只有 0 或 1 两种取值。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Drop the names of the two split frames; only df_kkbox is used below.\n",
    "del df_clicked, df_unclick"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 构造稀疏打分矩阵"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of Users :30755\n",
      "number of Songs :359966\n"
     ]
    }
   ],
   "source": [
    "from collections import defaultdict\n",
    "import pickle\n",
    "\n",
    "import scipy.io as sio\n",
    "import scipy.sparse as ss\n",
    "\n",
    "# Build integer encodings for users and songs, the inverted user<->song\n",
    "# tables and the sparse user-song score matrix from df_kkbox, then save\n",
    "# all of them under ../Data for later use.\n",
    "\n",
    "# Distinct users and songs, in order of first appearance.\n",
    "users = list(df_kkbox['msno'].unique())\n",
    "items = list(df_kkbox['song_id'].unique())\n",
    "n_users = len(users)\n",
    "n_items = len(items)\n",
    "\n",
    "print(\"number of Users :%d\" % n_users)\n",
    "print(\"number of Songs :%d\" % n_items)\n",
    "\n",
    "# Lookup tables: original id -> integer index.\n",
    "users_index = {user: idx for idx, user in enumerate(users)}\n",
    "items_index = {item: idx for idx, item in enumerate(items)}\n",
    "\n",
    "# Encode both id columns in one vectorised pass each instead of calling\n",
    "# df_kkbox.iloc[i] for every record, which is very slow on millions of rows.\n",
    "user_codes = df_kkbox['msno'].map(users_index).values\n",
    "item_codes = df_kkbox['song_id'].map(items_index).values\n",
    "\n",
    "# Inverted tables over every record of df_kkbox, whatever its target:\n",
    "# song indices per user / user indices per song.\n",
    "# tolist() yields plain Python ints, so the pickled sets keep the same\n",
    "# element type as when they were filled from enumerate() indices.\n",
    "user_items = defaultdict(set)\n",
    "item_users = defaultdict(set)\n",
    "for user_idx, item_idx in zip(user_codes.tolist(), item_codes.tolist()):\n",
    "    user_items[user_idx].add(item_idx)\n",
    "    item_users[item_idx].add(user_idx)\n",
    "\n",
    "# Sparse user-song score matrix with target as the score, built in one\n",
    "# shot from coordinate arrays rather than by per-element DOK assignment.\n",
    "scores = pd.DataFrame({\n",
    "    'user': user_codes,\n",
    "    'item': item_codes,\n",
    "    'target': df_kkbox['target'].values,\n",
    "})\n",
    "# Mirror item-assignment semantics: the last record of a repeated pair\n",
    "# wins (COO would otherwise sum duplicates) and zero scores are not stored.\n",
    "scores = scores.drop_duplicates(subset=['user', 'item'], keep='last')\n",
    "scores = scores[scores['target'] != 0]\n",
    "user_item_scores = ss.coo_matrix(\n",
    "    (scores['target'].values.astype(float),\n",
    "     (scores['user'].values, scores['item'].values)),\n",
    "    shape=(n_users, n_items),\n",
    ")\n",
    "\n",
    "# Save the inverted tables; the with-blocks close each file handle.\n",
    "with open(\"../Data/user_items.pkl\", 'wb') as fh:\n",
    "    pickle.dump(user_items, fh)\n",
    "with open(\"../Data/item_users.pkl\", 'wb') as fh:\n",
    "    pickle.dump(item_users, fh)\n",
    "\n",
    "# Save the user-song score matrix in Matrix Market format for later use.\n",
    "# NOTE(review): the order of entries in the file may differ from the one\n",
    "# produced by the DOK-based version; the matrix itself is the same.\n",
    "sio.mmwrite(\"../Data/user_item_scores\", user_item_scores)\n",
    "\n",
    "# Save the user index table.\n",
    "with open(\"../Data/users_index.pkl\", 'wb') as fh:\n",
    "    pickle.dump(users_index, fh)\n",
    "# Save the song index table.\n",
    "with open(\"../Data/items_index.pkl\", 'wb') as fh:\n",
    "    pickle.dump(items_index, fh)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
